Import Dataset and Libs

In [88]:
# Colab-only: interactively upload the dataset file into /content.
# NOTE(review): requires a browser session — breaks "Restart & Run All"
# outside Google Colab; consider guarding or replacing with a fixed path.
from google.colab import files
files.upload()

Libraries

In [89]:
# Core data-handling and plotting stack.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")  # silence library deprecation chatter

# Preprocessing: encoders, scalers, imputation.
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from numpy import where
from sklearn.decomposition import PCA

# Model selection and evaluation metrics.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier

# Oversampling strategies for the imbalanced target.
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): LogisticRegression is already imported above — duplicate import.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# NOTE(review): sklearn.metrics.plot_confusion_matrix was removed in
# scikit-learn 1.2 — this line fails on modern environments. A local
# plot_confusion_matrix function defined later shadows it anyway.
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# !pip install pycaret
# import pycaret
# from pycaret.utils import enable_colab 
# enable_colab()
In [90]:
!pwd
/content
In [91]:
# Load the task dataset (anonymized columns A..X plus binary target y).
# NOTE(review): hardcoded Colab path — consider a configurable DATA_DIR.
df = pd.read_csv("/content/task_data.csv")
In [92]:
df.head()
Out[92]:
A B C D E F G H I J K L M N O P Q R S T U V W X y
0 437401 2200.0 3 years 7.40% 68.33 1 6.0 mortgage 58000.0 Not Verified home_improvement 1.43 0.0 770.0 774.0 2.0 12.0 0.0 441.0 2.8% 20.0 804.0 800.0 0.0 1
1 1051124 20000.0 5 years 14.27% 468.17 3 4.0 rent 45000.0 Not Verified debt_consolidation 10.16 0.0 715.0 719.0 0.0 7.0 0.0 15197.0 88.7% 14.0 624.0 620.0 0.0 0
2 885906 24000.0 3 years 7.51% 746.66 1 4.0 mortgage 119500.0 Verified credit_card 6.01 0.0 755.0 759.0 0.0 9.0 0.0 24488.0 40.3% 23.0 774.0 770.0 0.0 1
3 383996 4000.0 3 years 7.68% 124.77 1 0.0 rent 32000.0 Not Verified major_purchase 5.70 0.0 770.0 774.0 2.0 10.0 0.0 7549.0 14.2% 33.0 754.0 750.0 0.0 1
4 454629 4800.0 3 years 8.94% 152.51 1 6.0 mortgage 87000.0 Not Verified debt_consolidation 10.54 0.0 730.0 734.0 1.0 15.0 0.0 27901.0 40.9% 27.0 679.0 675.0 0.0 1
In [93]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       10000 non-null  int64  
 1   B       10000 non-null  float64
 2   C       10000 non-null  object 
 3   D       10000 non-null  object 
 4   E       10000 non-null  float64
 5   F       10000 non-null  int64  
 6   G       9708 non-null   float64
 7   H       10000 non-null  object 
 8   I       10000 non-null  float64
 9   J       10000 non-null  object 
 10  K       10000 non-null  object 
 11  L       10000 non-null  float64
 12  M       10000 non-null  float64
 13  N       10000 non-null  float64
 14  O       10000 non-null  float64
 15  P       10000 non-null  float64
 16  Q       10000 non-null  float64
 17  R       10000 non-null  float64
 18  S       10000 non-null  float64
 19  T       9985 non-null   object 
 20  U       10000 non-null  float64
 21  V       10000 non-null  float64
 22  W       10000 non-null  float64
 23  X       9838 non-null   float64
 24  y       10000 non-null  int64  
dtypes: float64(16), int64(3), object(6)
memory usage: 1.9+ MB
In [94]:
df.columns
Out[94]:
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
       'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'y'],
      dtype='object')
In [95]:
# All predictor columns: every column except the final target column 'y'.
features = df.columns[:-1]
features
Out[95]:
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
       'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X'],
      dtype='object')
In [96]:
'''The Spearman correlation evaluates the monotonic relationship between two continuous or ordinal variables. 
In a monotonic relationship, the variables tend to change together, but not necessarily at a constant rate. 
The Spearman correlation coefficient is based on the ranked values for each variable rather than the raw data.'''
# FIX: the title claims Spearman, but DataFrame.corr() defaults to Pearson —
# request the Spearman method explicitly so the plot matches its caption.
plt.figure(figsize=(16, 16))
heatmap = sns.heatmap(np.round(df[features].corr(method='spearman'), 3), vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features correlation', fontdict={'fontsize':10}, pad=10)
plt.title("Spearman correlation - train data")
plt.show()
In [97]:
# See Imbalance: bar chart of target class counts (heavily skewed toward 1).
# FIX: the top-level pd.value_counts(...) helper is deprecated (removed in
# recent pandas) — call .value_counts() on the Series instead.
df['y'].value_counts().plot.bar()
plt.title('Class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
df['y'].value_counts()
Out[97]:
1    8575
0    1425
Name: y, dtype: int64

Preprocessing and Feature Engineering

In [98]:
df.head()
Out[98]:
A B C D E F G H I J K L M N O P Q R S T U V W X y
0 437401 2200.0 3 years 7.40% 68.33 1 6.0 mortgage 58000.0 Not Verified home_improvement 1.43 0.0 770.0 774.0 2.0 12.0 0.0 441.0 2.8% 20.0 804.0 800.0 0.0 1
1 1051124 20000.0 5 years 14.27% 468.17 3 4.0 rent 45000.0 Not Verified debt_consolidation 10.16 0.0 715.0 719.0 0.0 7.0 0.0 15197.0 88.7% 14.0 624.0 620.0 0.0 0
2 885906 24000.0 3 years 7.51% 746.66 1 4.0 mortgage 119500.0 Verified credit_card 6.01 0.0 755.0 759.0 0.0 9.0 0.0 24488.0 40.3% 23.0 774.0 770.0 0.0 1
3 383996 4000.0 3 years 7.68% 124.77 1 0.0 rent 32000.0 Not Verified major_purchase 5.70 0.0 770.0 774.0 2.0 10.0 0.0 7549.0 14.2% 33.0 754.0 750.0 0.0 1
4 454629 4800.0 3 years 8.94% 152.51 1 6.0 mortgage 87000.0 Not Verified debt_consolidation 10.54 0.0 730.0 734.0 1.0 15.0 0.0 27901.0 40.9% 27.0 679.0 675.0 0.0 1

Encode categorical columns and convert to Numeric

In [99]:
# Strip the trailing text from string-valued columns, then cast to numbers:
#   C: "3 years" -> "3 "   (drop the 5-char "years" suffix; to_numeric strips space)
#   D: "7.40%"   -> "7.40" (drop the trailing '%')
#   T: "2.8%"    -> "2.8"  (drop the trailing '%')
suffix_chars = {'C': 5, 'D': 1, 'T': 1}
for col, n_drop in suffix_chars.items():
  df[col] = pd.to_numeric(df[col].str.slice(stop=-n_drop))
In [100]:
# Normalize
# Standard-scale (zero mean, unit variance) the bulk of the numeric columns.
from sklearn.preprocessing import StandardScaler, RobustScaler
features_to_scale = ['A', 'B', 'E', 'L', 'N', 'O', 'Q', 'S', 'T', 'V', 'W']
for col in features_to_scale:
  df[col] = StandardScaler().fit_transform((df[col].values.reshape(-1, 1)))

# RobustScaler is less prone to outliers.
# NOTE(review): 'B' appears in BOTH lists, so it is standard-scaled above and
# then robust-scaled again here — confirm the double scaling is intentional.
std_scaler = StandardScaler()  # NOTE(review): unused — only rob_scaler is applied below
rob_scaler = RobustScaler()

for col in ['B', 'D','I','U']:
  df[col] = rob_scaler.fit_transform(df[col].values.reshape(-1,1))
In [101]:
df.head()
Out[101]:
A B C D E F G H I J K L M N O P Q R S T U V W X y
0 -1.170959 -0.821053 3 -0.825926 -1.227406 1 6.0 mortgage -0.022345 Not Verified home_improvement -1.786380 0.0 1.535529 1.535529 2.0 0.603514 0.0 -0.809538 -1.631394 -0.066667 1.425153 1.032411 0.0 1
1 1.771331 1.052632 5 0.446296 0.704809 3 4.0 rent -0.331869 Not Verified debt_consolidation -0.473421 0.0 0.011004 0.011004 0.0 -0.526789 0.0 0.112382 1.390018 -0.466667 -0.873812 -0.515599 0.0 0
2 0.979249 1.473684 3 -0.805556 2.050603 1 4.0 mortgage 1.441940 Verified credit_card -1.097565 0.0 1.119749 1.119749 0.0 -0.074668 0.0 0.692861 -0.312384 0.133333 1.041992 0.774409 0.0 1
3 -1.426991 -0.631579 3 -0.774074 -0.954661 1 0.0 rent -0.641393 Not Verified major_purchase -1.144188 0.0 1.535529 1.535529 2.0 0.151393 0.0 -0.365447 -1.230415 0.800000 0.786551 0.602408 0.0 1
4 -1.088365 -0.547368 3 -0.540741 -0.820608 1 6.0 mortgage 0.668131 Not Verified debt_consolidation -0.416270 0.0 0.426784 0.426784 1.0 1.281695 0.0 0.906097 -0.291280 0.400000 -0.171351 -0.042596 0.0 1

Encode Categorical Features

In [102]:
# Integer-encode the remaining string-valued categorical columns
# (H = home ownership, J = verification status, K = purpose).
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for column in ['H', 'J', 'K']:
  # fit-then-transform on the same column collapses to a single call
  df[column] = le.fit_transform(df[column])
In [103]:
df.head()
Out[103]:
A B C D E F G H I J K L M N O P Q R S T U V W X y
0 -1.170959 -0.821053 3 -0.825926 -1.227406 1 6.0 0 -0.022345 0 4 -1.786380 0.0 1.535529 1.535529 2.0 0.603514 0.0 -0.809538 -1.631394 -0.066667 1.425153 1.032411 0.0 1
1 1.771331 1.052632 5 0.446296 0.704809 3 4.0 4 -0.331869 0 2 -0.473421 0.0 0.011004 0.011004 0.0 -0.526789 0.0 0.112382 1.390018 -0.466667 -0.873812 -0.515599 0.0 0
2 0.979249 1.473684 3 -0.805556 2.050603 1 4.0 0 1.441940 2 1 -1.097565 0.0 1.119749 1.119749 0.0 -0.074668 0.0 0.692861 -0.312384 0.133333 1.041992 0.774409 0.0 1
3 -1.426991 -0.631579 3 -0.774074 -0.954661 1 0.0 4 -0.641393 0 6 -1.144188 0.0 1.535529 1.535529 2.0 0.151393 0.0 -0.365447 -1.230415 0.800000 0.786551 0.602408 0.0 1
4 -1.088365 -0.547368 3 -0.540741 -0.820608 1 6.0 0 0.668131 0 2 -0.416270 0.0 0.426784 0.426784 1.0 1.281695 0.0 0.906097 -0.291280 0.400000 -0.171351 -0.042596 0.0 1

Encode Categorical (Number) Features

In [104]:
# Encode low-cardinality numeric columns as category codes.
# (Under .cat.codes any NaN becomes the sentinel code -1.)
code_cols = ['C', 'F', 'G', 'M', 'P', 'R', 'X']
df[code_cols] = df[code_cols].apply(lambda s: s.astype('category').cat.codes)
In [105]:
df
Out[105]:
A B C D E F G H I J K L M N O P Q R S T U V W X y
0 -1.170959 -0.821053 0 -0.825926 -1.227406 0 6 0 -0.022345 0 4 -1.786380 0 1.535529 1.535529 2 0.603514 0 -0.809538 -1.631394 -0.066667 1.425153 1.032411 0 1
1 1.771331 1.052632 1 0.446296 0.704809 2 4 4 -0.331869 0 2 -0.473421 0 0.011004 0.011004 0 -0.526789 0 0.112382 1.390018 -0.466667 -0.873812 -0.515599 0 0
2 0.979249 1.473684 0 -0.805556 2.050603 0 4 0 1.441940 2 1 -1.097565 0 1.119749 1.119749 0 -0.074668 0 0.692861 -0.312384 0.133333 1.041992 0.774409 0 1
3 -1.426991 -0.631579 0 -0.774074 -0.954661 0 0 4 -0.641393 0 6 -1.144188 0 1.535529 1.535529 2 0.151393 0 -0.365447 -1.230415 0.800000 0.786551 0.602408 0 1
4 -1.088365 -0.547368 0 -0.540741 -0.820608 0 6 0 0.668131 0 2 -0.416270 0 0.426784 0.426784 1 1.281695 0 0.906097 -0.291280 0.400000 -0.171351 -0.042596 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9995 -0.178557 1.578947 0 -0.277778 2.361137 1 4 4 2.168131 2 2 -0.787749 0 0.149597 0.149597 0 0.151393 0 1.209489 -0.147069 0.600000 0.978132 0.731409 0 1
9996 -0.398053 -0.526316 1 -1.053704 -1.088569 0 3 4 -0.927012 1 6 -0.268882 0 1.535529 1.535529 0 -0.752849 0 -0.767866 -1.371110 -0.600000 -1.129253 -0.687600 0 1
9997 0.992850 0.210526 0 -0.970370 0.222915 0 10 4 0.691940 2 2 -1.178779 0 1.258342 1.258342 0 -0.978909 0 0.008919 -0.829436 -0.800000 0.403390 0.344406 0 1
9998 1.510380 0.526316 0 -0.733333 0.710559 0 3 4 0.501464 1 1 -0.635849 0 0.288191 0.288191 0 0.829574 0 0.174359 -0.470666 0.666667 -1.065393 -0.644600 0 0
9999 0.899920 0.526316 0 -0.733333 0.710559 0 5 4 -0.511869 1 2 -0.228275 0 0.149597 0.149597 0 -0.526789 0 -0.564376 -0.097825 0.466667 0.211810 0.215406 0 1

10000 rows × 25 columns

In [106]:
# Column groupings reused by the plotting cells below.
continuous = ['A', 'B','D','E', 'I', 'L', 'N', 'O', 'Q', 'S','T', 'U', 'V', 'W']
cat = ['C', 'F', 'G', 'H', 'J', 'K', 'M', 'P', 'R', 'X']
In [107]:
def plot_feature_distribution(df1, features):
    """Plot the distribution of each feature of ``df1`` on a 5x3 grid.

    Args:
        df1: DataFrame holding the columns to plot.
        features: iterable of column names (at most 15 fit the grid).
    """
    sns.set_style('whitegrid')
    plt.figure()
    fig, ax = plt.subplots(5, 3, figsize=(14, 24))

    # FIX: the original ignored the `df1` parameter and plotted the global `df`.
    for i, feature in enumerate(features, start=1):
        plt.subplot(5, 3, i)
        # sns.distplot is deprecated in newer seaborn; kept for this environment.
        sns.distplot(df1[feature], color="orange", kde=True, bins=60, label='train')
        plt.xlabel(feature, fontsize=9); plt.legend()
    plt.show();
In [108]:
# Numerical predictors
numerical_predictors = continuous
from plotly.subplots import make_subplots
import plotly.graph_objects as go

fig = make_subplots(rows=15, cols=2, subplot_titles=numerical_predictors)

for i, col in enumerate(continuous):
    fig.add_trace(
        go.Histogram(x=df[col]), 
        i//2 + 1, i%2 + 1)
    
fig.update_layout(
    title_text='Numerical Predictors Distribution',
    height=1200,
    showlegend=False)

Missing Value Imputation

In [109]:
# df = df.fillna(-999)
# from sklearn.impute import SimpleImputer
# df = SimpleImputer(missing_value = np.nan, strategy = 'mean').fit_transform(df)
# df_imp = df.copy()
In [110]:
df.isna().sum()
Out[110]:
A     0
B     0
C     0
D     0
E     0
F     0
G     0
H     0
I     0
J     0
K     0
L     0
M     0
N     0
O     0
P     0
Q     0
R     0
S     0
T    15
U     0
V     0
W     0
X     0
y     0
dtype: int64
In [111]:
# Drop the 15 rows where 'T' is still missing (the only NaNs left at this point).
# NOTE(review): overwrites df in place — the raw frame is no longer recoverable.
df = df.dropna()
In [112]:
# df
# df_m = df.copy()
# df_bfm = df.copy()
# df = df_m.copy()

Notes

  • Create a 50/50 sub-dataframe ratio of "1" and "0" classes.
  • Determine the Classifiers we are going to use and decide which one has a higher accuracy.

  • Create a Neural Network and compare the accuracy to our best classifier.

We don't use the accuracy score as a metric on imbalanced datasets (it is usually high and misleading); instead we use the F1-score, precision/recall, or the confusion matrix.

Outlier Detection

In [113]:
new_df = df.copy()
In [114]:
import seaborn as sns
from scipy.stats import norm

# Distributions of A, B and D for the positive class (y == 1),
# each overlaid with a fitted normal curve.
# (Refactored: three copy-pasted stanzas collapsed into one loop.)
cols_to_plot = ['A', 'B', 'D']
palette = ['#FB8861', '#56F9BB', '#C5B3F9']

f, axes = plt.subplots(1, 3, figsize=(20, 6))
for ax, col, color in zip(axes, cols_to_plot, palette):
    values = new_df[col].loc[new_df['y'] == 1].values
    sns.distplot(values, ax=ax, fit=norm, color=color)
    ax.set_title('{} Distribution \n (Distribution)'.format(col), fontsize=14)

plt.show()
In [115]:
# -----> IQR-based outlier scan on feature 'A' for the positive class.
a_pos = new_df['A'].loc[new_df['y'] == 1].values
q25, q75 = np.percentile(a_pos, 25), np.percentile(a_pos, 75)
print('Quartile 25: {} | Quartile 75: {}'.format(q25, q75))
a_iqr = q75 - q25
print('iqr: {}'.format(a_iqr))

# Tukey fences: 1.5 * IQR beyond the quartiles.
a_cut_off = a_iqr * 1.5
a_lower, a_upper = q25 - a_cut_off, q75 + a_cut_off
print('Cut Off: {}'.format(a_cut_off))
# FIX: the printed labels said "V14"/"V10" (copied from another dataset);
# this cell analyses feature 'A'.
print('A Lower: {}'.format(a_lower))
print('A Upper: {}'.format(a_upper))

outliers = [x for x in a_pos if x < a_lower or x > a_upper]
print('Feature A Outliers for Cases: {}'.format(len(outliers)))
print('A outliers:{}'.format(outliers))

# new_df = new_df.drop(new_df[(new_df['A'] > a_upper) | (new_df['A'] < a_lower)].index)
# print('----' * 44)
Quartile 25: -0.7947687628163442 | Quartile 75: 0.717209148956256
iqr: 1.5119779117726002
Cut Off: 2.2679668676589
V14 Lower: -3.0627356304752444
V14 Upper: 2.9851760166151564
Feature V14 Outliers for Cases: 0
V10 outliers:[]
In [116]:
df.columns
Out[116]:
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
       'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'y'],
      dtype='object')
In [117]:
import matplotlib.gridspec as gridspec

# Class-conditional histograms (y==1 overlaid on y==0) for each continuous column.
plt.figure(figsize=(12,28*4))
gs = gridspec.GridSpec(28, 1)  # 28 slots allocated; only len(continuous)=14 are used
for i, cn in enumerate(continuous):
    ax = plt.subplot(gs[i])
    sns.distplot(df[cn][df.y == 1], bins=50)  # positive class
    sns.distplot(df[cn][df.y == 0], bins=50)  # negative class
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
plt.show()
In [118]:
# Distribution
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
fig = make_subplots(rows=2, cols=1, shared_xaxes=True)

fig.append_trace(
    go.Histogram(x=df['y']), 
    1, 1)

fig.append_trace(
    go.Box(x=df['y'], name='y'),
    2, 1)

fig.update_layout(title_text='Target Distribution', showlegend=False)
In [119]:
# Numerical predictors

numerical_predictors = continuous
fig = make_subplots(rows=20, cols=2, subplot_titles=numerical_predictors)

for i, col in enumerate(numerical_predictors):
    fig.add_trace(
        go.Histogram(x=df[col]), 
        i//2 + 1, i%2 + 1)
    
fig.update_layout(
    title_text='Numerical Predictors Distribution',
    height=1200,
    showlegend=False)
In [120]:
# Categorical predictors
# categorical_predictors = [p for p in predictors if 'cat' in p]
categorical_predictors = cat
fig = make_subplots(rows=5, cols=2, subplot_titles=categorical_predictors)

for i, col in enumerate(categorical_predictors):
    fig.add_trace(
        go.Histogram(x=df[col]), 
        i//2 + 1, i%2 + 1)
    
fig.update_layout(
    title_text='Categorical Predictors Distribution',
    height=1000,
    showlegend=False)

Feature Importance - PCA

In [121]:
ex_df = df.copy()
In [122]:
# ex_df = ex_df.fillna(-999)
# Scaled Data
# Fit a StandardScaler on all predictors (target column 'y' excluded) for PCA.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(ex_df.drop('y',axis=1))
Out[122]:
StandardScaler(copy=True, with_mean=True, with_std=True)
In [123]:
scaled_data = scaler.transform(ex_df.drop('y',axis=1))
In [124]:
# Dimensionality Reduction and Clustering:
# Project the scaled predictors onto their first two principal components.
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(scaled_data)
Out[124]:
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)
In [125]:
x_pca = pca.transform(scaled_data)
In [126]:
x_pca.shape
Out[126]:
(9985, 2)
In [127]:
# Scatter the two principal components, colored by the target class,
# to eyeball class separability in 2-D.
plt.figure(figsize=(8,6))
plt.scatter(x_pca[:,0],x_pca[:,1],c=ex_df['y'])
plt.xlabel('First principal component')
plt.ylabel('Second Principal Component')
Out[127]:
Text(0, 0.5, 'Second Principal Component')

Defining Target and Predictors

In [128]:
# Split the frame into a predictor matrix X and a target vector y.
X = df.drop(['y'], axis=1).to_numpy()
y = df['y'].to_numpy()
print('Shape of X: {}'.format(X.shape))
print('Shape of y: {}'.format(y.shape))
Shape of X: (9985, 24)
Shape of y: (9985,)

Oversampling with SMOTE

In [129]:
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

# 80/20 hold-out split; random_state pins it for reproducibility.
# NOTE(review): no `stratify=y` — with a ~6:1 class imbalance the test-set
# class ratio can drift; consider stratifying the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)
Number transactions X_train dataset:  (7988, 24)
Number transactions y_train dataset:  (7988,)
Number transactions X_test dataset:  (1997, 24)
Number transactions y_test dataset:  (1997,)
In [130]:
# Oversample the minority class of the TRAINING split only (never the test set).
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(random_state=2)
# FIX: SMOTE.fit_sample() was removed in imbalanced-learn 0.8;
# fit_resample() is the supported, equivalent API.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))
Before OverSampling, counts of label '1': 6841
Before OverSampling, counts of label '0': 1147 

After OverSampling, the shape of train_X: (13682, 24)
After OverSampling, the shape of train_y: (13682,) 

After OverSampling, counts of label '1': 6841
After OverSampling, counts of label '0': 6841
In [131]:
# !pip install pycaret
# import pycaret
# from pycaret.utils import enable_colab 
# enable_colab()

Modeling

Logistic Regression with GridsearchCV

In [132]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report

# Grid-search the inverse regularization strength C over 1..10
# with 5-fold CV on the SMOTE-resampled training data.
parameters = { 'C': np.linspace(1, 10, 10) }

lr = LogisticRegression()
clf = GridSearchCV(lr, parameters, cv=5, verbose=5, n_jobs=3)
clf.fit(X_train_res, y_train_res.ravel())
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    2.0s
[Parallel(n_jobs=3)]: Done  50 out of  50 | elapsed:    6.9s finished
Out[132]:
GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=3,
             param_grid={'C': array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=5)
In [133]:
clf.best_params_
Out[133]:
{'C': 6.0}
In [134]:
# Refit a logistic regression with the best C found by the grid search (C=6).
lr1 = LogisticRegression(C=6,penalty='l2', verbose=5)
lr1.fit(X_train_res, y_train_res.ravel())
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished
Out[134]:
LogisticRegression(C=6, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=5,
                   warm_start=False)
In [135]:
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Plot the confusion matrix `cm` as an annotated image.

    Args:
        cm: square confusion-matrix array (e.g. from sklearn confusion_matrix).
        classes: tick labels for both axes, in matrix row/column order.
        normalize: if True, display row-normalized rates instead of raw counts.
        title: figure title.
        cmap: matplotlib colormap for the image.

    NOTE: this local definition shadows the `plot_confusion_matrix`
    imported from sklearn.metrics at the top of the notebook.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    if normalize:
        # Row-normalize: each cell becomes a rate within its true class.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # FIX: format normalized cells as 2-decimal rates instead of raw float
    # reprs; also removed the dead `1#print(...)` no-op branch.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [136]:
# Training-set confusion matrix and recall (TP / (TP + FN)) for the tuned model.
y_train_pre = lr1.predict(X_train_res)
cnf_matrix_tra = confusion_matrix(y_train_res, y_train_pre)
print("Recall metric in the train dataset: {}%".format(100*cnf_matrix_tra[1,1]/(cnf_matrix_tra[1,0]+cnf_matrix_tra[1,1])))
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix_tra , classes=class_names, title='Confusion matrix')
plt.show()
Recall metric in the train dataset: 84.43210057009209%
In [137]:
# Held-out test-set confusion matrix and recall for the tuned model.
y_pre = lr1.predict(X_test)

cnf_matrix = confusion_matrix(y_test, y_pre)

# Recall = TP / (TP + FN), computed directly from the matrix cells.
print("Recall metric in the testing dataset: {}%".format(100*cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1])))
#print("Precision metric in the testing dataset: {}%".format(100*cnf_matrix[0,0]/(cnf_matrix[0,0]+cnf_matrix[1,0])))
# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix , classes=class_names, title='Confusion matrix')
plt.show()
Recall metric in the testing dataset: 85.43238537434706%
In [138]:
tmp = lr1.fit(X_train_res, y_train_res.ravel())
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s finished
In [139]:
# ROC curve on the held-out test set, using the model's continuous
# decision scores rather than hard 0/1 predictions.
y_pred_sample_score = tmp.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_sample_score)
roc_auc = auc(fpr,tpr)

# Plot ROC
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b',label='AUC = %0.3f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')  # chance diagonal for reference
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
print("ROC AUC score: ", roc_auc)
ROC AUC score:  0.9025867291390419
In [140]:
# style.use('ggplot')
sns.set_style('whitegrid')
plt.subplots(figsize = (30,30))
## Plotting heatmap. Generate a mask for the upper triangle (taken from seaborn example gallery)
# FIX: the np.bool alias was removed in NumPy 1.24 — use the builtin bool dtype.
mask = np.zeros_like(df.corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(df.corr(), cmap=sns.diverging_palette(20, 220, n=200), annot=True, mask=mask, center = 0, );
plt.title("Heatmap of all the Features of Train data set", fontsize = 25);
In [141]:
#visualizing the features w high negative correlation
f, axes = plt.subplots(nrows=3, ncols=3, figsize=(25,15))

f.suptitle('Features With High Negative Correlation', size=35)
sns.boxplot(x="y", y="A", data=df, ax=axes[0,0])
sns.boxplot(x="y", y="B", data=df, ax=axes[0,1])
sns.boxplot(x="y", y="C", data=df, ax=axes[0,2])
sns.boxplot(x="y", y="D", data=df, ax=axes[1,0])
sns.boxplot(x="y", y="F", data=df, ax=axes[1,1])
sns.boxplot(x="y", y="G", data=df, ax=axes[1,2])
sns.boxplot(x="y", y="H", data=df, ax=axes[2,0])
sns.boxplot(x="y", y="I", data=df, ax=axes[2,1])

# FIX: 'U' was also drawn on axes[2,1], overplotting 'I', while the empty
# axes[2,2] slot was deleted. Give 'U' its own subplot instead.
sns.boxplot(x="y", y="U", data=df, ax=axes[2,2])
In [142]:
#visualizing the features w high positive correlation
f, axes = plt.subplots(nrows=1, ncols=5, figsize=(14,5))

f.suptitle('Features With High Positive Correlation', size=20)
sns.boxplot(x="y", y="N", data=df, ax=axes[0])
sns.boxplot(x="y", y="O", data=df, ax=axes[1])
sns.boxplot(x="y", y="V", data=df, ax=axes[2])
sns.boxplot(x="y", y="W", data=df, ax=axes[3])
sns.boxplot(x="y", y="U", data=df, ax=axes[4])
Out[142]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd3447fe790>
In [143]:
# NOTE(review): this re-defines plot_feature_distribution from an earlier cell,
# silently shadowing it — the notebook should keep a single definition.
def plot_feature_distribution(df1, features):
    """Plot the distribution of each feature of ``df1`` on a 5x3 grid."""
    sns.set_style('whitegrid')
    plt.figure()
    fig, ax = plt.subplots(5, 3, figsize=(14, 24))

    # FIX: the original ignored the `df1` parameter and plotted the global `df`.
    for i, feature in enumerate(features, start=1):
        plt.subplot(5, 3, i)
        sns.distplot(df1[feature], color="orange", kde=True, bins=60, label='train')
        plt.xlabel(feature, fontsize=9); plt.legend()
    plt.show();
plot_feature_distribution(df,continuous)
<Figure size 432x288 with 0 Axes>

Plot Categorical

In [144]:
# plt.style.use("ggplot")
plt.figure(figsize=(25,20))
for i,feature in enumerate(cat):
    plt.subplot(2,5,i+1)
    sns.countplot(df[feature])
In [145]:
def distribution3(feature,category,df=df):
    """Plot a histogram of `feature`, hue-split by the levels of `category`.

    NOTE: `df=df` captures the global frame at definition time.
    """
    plt.subplots(figsize=(15, 7))
    sns.histplot(df,x=feature,hue=category)
In [146]:
distribution3('D','H')
In [147]:
def boxploting1(feature,category,df=df,figure_size=(15,7)):
    """Box-plot `feature` against `category` with full-range whiskers (whis=[0, 100])."""
    plt.subplots(figsize=figure_size)
    sns.boxplot(x=feature, y=category, data=df,whis=[0, 100], width=.6, palette="vlag")

LogisticRegression() LinearDiscriminantAnalysis() KNeighborsClassifier() RandomForestClassifier() DecisionTreeClassifier()

XGBClassifier() GaussianNB() GradientBoostingClassifier() LGBMClassifier()

In [148]:
def Definedata():
    """Return (X, y): the predictor matrix and target vector from the global df."""
    feature_matrix = df.drop(columns=['y']).values
    target = df['y'].values
    return feature_matrix, target
In [149]:
def SMOTE():
    """Oversample the minority class with SMOTE, then make a 50/50 train/test split.

    Prints the class distribution before and after resampling, scatter-plots the
    first two predictor columns per class, and returns
    (X_train, X_test, y_train, y_test).

    NOTE(review): this function name shadows the imblearn SMOTE class at module
    level; the local import below keeps the class reachable inside the body.
    """
    from collections import Counter
    from sklearn.model_selection import train_test_split
    from sklearn.datasets import make_classification
    from imblearn.over_sampling import SMOTE
    from matplotlib import pyplot
    from numpy import where

    X, y = Definedata()

    # summarize class distribution
    counter = Counter(y)
    print(counter)
    # transform the dataset
    smt = SMOTE(random_state=0)
    # FIX: fit_sample() was removed in imbalanced-learn 0.8 — use fit_resample().
    X, y = smt.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)
    # summarize the new class distribution
    counter = Counter(y)
    print(counter)
    # scatter plot of examples by class label
    for label, _ in counter.items():
        row_ix = where(y == label)[0]
        pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
    pyplot.legend()
    pyplot.show()
    return X_train, X_test, y_train, y_test
In [150]:
# SMOTE()
# SMOTE() = (X_train1, X_test1, y_train1, y_test1)
In [151]:
def Models(models, X_train, X_test, y_train, y_test, title):
    """Fit `models` on (X_train, y_train) and plot annotated confusion matrices
    for the training split, the test split, and the full dataset.

    Args:
        models: an unfitted sklearn-style classifier instance.
        X_train, X_test, y_train, y_test: the (possibly resampled) splits.
        title: heading for the first heatmap.

    Returns:
        (y, predictions) for the full dataset from Definedata().
    """
    model = models
    model.fit(X_train,y_train)
    
    X, y = Definedata()
    train_matrix = pd.crosstab(y_train, model.predict(X_train), rownames=['Actual'], colnames=['Predicted'])    
    test_matrix = pd.crosstab(y_test, model.predict(X_test), rownames=['Actual'], colnames=['Predicted'])
    matrix = pd.crosstab(y, model.predict(X), rownames=['Actual'], colnames=['Predicted'])
    
    f,(ax1,ax2,ax3) = plt.subplots(1,3,sharey=True, figsize=(20, 3))
    
    # FIX: recall_score's signature is (y_true, y_pred) — the original passed
    # predictions first, which computes positive-class precision, not recall.
    g1 = sns.heatmap(train_matrix, annot=True, fmt=".1f", cbar=False,annot_kws={"size": 16},ax=ax1)
    g1.set_title(title)
    g1.set_ylabel('Total = {}'.format(y_train.sum()), fontsize=14, rotation=90)
    g1.set_xlabel('Recall Accuracy score for Trainingset: {}'.format(recall_score(y_train, model.predict(X_train))))
    
    g2 = sns.heatmap(test_matrix, annot=True, fmt=".1f",cbar=False,annot_kws={"size": 16},ax=ax2)
    g2.set_ylabel('Total = {}'.format(y_test.sum()), fontsize=14, rotation=90)
    g2.set_xlabel('Recall Accuracy score for Testingset: {}'.format(recall_score(y_test, model.predict(X_test))))

    g3 = sns.heatmap(matrix, annot=True, fmt=".1f",cbar=False,annot_kws={"size": 16},ax=ax3)
    g3.set_ylabel('Total = {}'.format(y.sum()), fontsize=14, rotation=90)
    g3.set_xlabel('Recall Accuracy score for Totalset: {}'.format(recall_score(y, model.predict(X))))
    
    plt.show()
    return y, model.predict(X)
    
def Featureimportances(models, X_train, y_train):
    """Fit `models` and return a DataFrame of per-feature importances.

    Returns a frame with columns Features / Importance / Sum Importance
    (cumulative sum in original column order), sorted ascending by Importance.
    """
    model = models
    model.fit(X_train,y_train)
    importances = model.feature_importances_
    # FIX: derive the feature count from the fitted importances rather than the
    # hard-coded 24, so the function survives column additions/removals.
    features = df.columns[:len(importances)]
    imp = pd.DataFrame({'Features': features, 'Importance': importances})
    imp['Sum Importance'] = imp['Importance'].cumsum()
    imp = imp.sort_values(by = 'Importance')
    return imp

Logistic Regression without GridSearchCV

In [152]:
title = 'LogisticRegression/SMOTE'
%time Models(LogisticRegression(),X_train_res, X_test, y_train_res, y_test, title)
CPU times: user 994 ms, sys: 643 ms, total: 1.64 s
Wall time: 914 ms
Out[152]:
(array([1, 0, 1, ..., 1, 0, 1]), array([1, 0, 1, ..., 1, 0, 1]))

Gradient Boosting Classifier

In [153]:
title = 'GradientBoostingClassifier/SMOTE'
%time Models(GradientBoostingClassifier(n_estimators=500, learning_rate=1, max_features=2, max_depth=2, random_state=0),X_train_res, X_test, y_train_res, y_test, title)
CPU times: user 4.29 s, sys: 173 ms, total: 4.46 s
Wall time: 4.3 s
Out[153]:
(array([1, 0, 1, ..., 1, 0, 1]), array([1, 0, 1, ..., 1, 0, 1]))
In [154]:
%time Featureimportances(GradientBoostingClassifier(n_estimators=500, learning_rate=1, max_features=2, max_depth=2, random_state=0),X_train_res, y_train_res)
CPU times: user 3.81 s, sys: 3.33 ms, total: 3.82 s
Wall time: 3.82 s
Out[154]:
Features Importance Sum Importance
7 H 0.001603 0.188211
17 R 0.003420 0.472484
6 G 0.005188 0.186608
8 I 0.005785 0.193997
16 Q 0.005785 0.469064
18 S 0.005840 0.478324
11 L 0.006914 0.351752
1 B 0.007479 0.020006
19 T 0.008026 0.486350
20 U 0.009635 0.495985
4 E 0.010897 0.163683
23 X 0.012039 1.000000
0 A 0.012527 0.012527
12 M 0.014231 0.365983
14 O 0.014242 0.401258
5 F 0.017737 0.181420
2 C 0.019812 0.039818
13 N 0.021033 0.387016
10 K 0.023588 0.344838
15 P 0.062020 0.463279
3 D 0.112968 0.152786
9 J 0.127254 0.321250
21 V 0.131872 0.627858
22 W 0.360103 0.987961
In [155]:
imp = %time Featureimportances(GradientBoostingClassifier(n_estimators=500, learning_rate=1, max_features=2, max_depth=2, random_state=0),X_train_res, y_train_res)
CPU times: user 3.33 s, sys: 1.48 ms, total: 3.33 s
Wall time: 3.33 s
In [156]:
# FIX: building a new frame from the positional `features` Index and the
# index-aligned Series imp['Importance'] (whose index was shuffled by the sort
# inside Featureimportances) misaligns bar labels with their importances —
# reuse the Features column already stored in `imp` instead.
tmp = imp[['Features', 'Importance']].rename(
    columns={'Features': 'Feature', 'Importance': 'Feature importance'})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()   

Feature Importance

Note:

If the prediction set will contain the X feature, then keeping X in the training set wouldn't be a problem in terms of data leakage. Otherwise, it would obviously be problematic to rely on this model's performance. In the first case, I would probably go for k-fold stratified cross-validation with a pipeline build.

Multi Layer Perceptron

6. Predict and evaluate

In [157]:
# Multi-layer perceptron baseline: one hidden layer of 200 units; recall is
# reported as the last expression so the cell displays it.
# NOTE(review): this fits on the original X_train, not the resampled
# X_train_res used by the tree models above — confirm this is intentional.
from sklearn.metrics import recall_score
from sklearn.neural_network import MLPClassifier

# max_iter raised to 10000 so the solver can converge.
MLPC = MLPClassifier(hidden_layer_sizes=(200,), max_iter=10000)
MLPC.fit(X_train, y_train)
y_pred = MLPC.predict(X_test)
recall_acc = recall_score (y_test,y_pred)
recall_acc
Out[157]:
0.926871735345328

I have other methods left to test out, such as TabNet, conversion of the data to 3D for CNNs, an optimized NN model, and a few more. The score can be further improved to 96+ with more effort on feature engineering and optimized modelling.

Questions

1. Correlation

In [158]:
# Pearson correlation between columns B and I.
df['B'].corr(df['I'])
Out[158]:
0.1951525013098839

1.1 Max correlations

In [159]:
# Top 10 strongest pairwise correlations. drop_duplicates removes the mirrored
# A-B / B-A entries (NOTE(review): it would also drop a distinct pair that
# happens to share an identical correlation value).
corr_pairs = df.corr().unstack()
corr_pairs.sort_values(ascending=False).drop_duplicates().head(10)
Out[159]:
y  y    1.000000
F  D    0.947671
E  B    0.926965
W  V    0.853008
X  R    0.705114
Q  U    0.679557
y  V    0.518332
D  T    0.470337
W  y    0.464346
T  F    0.447076
dtype: float64

1.3 Max correlations visually

In [160]:
# Full pairwise Pearson correlation matrix, reused by the heatmap below.
corr = df.corr()
In [161]:
# Keep only strong positive correlations (r >= 0.8); weaker cells become NaN
# and render as blanks on the heatmap.
# NOTE(review): strong *negative* correlations are excluded by this filter —
# use corr.abs() >= 0.8 if those should be shown as well.
kot = corr[corr >= 0.8]
plt.figure(figsize=(12, 8))
sns.heatmap(kot, cmap='Greens')
Out[161]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd345f14ad0>

1.2 Role of correlation:

Correlation describes how one feature relates to another with respect to predicting the output together with the other variables. For example, for datasets whose features have linear relationships, a linear model can correctly capture the relationship (while avoiding multicollinearity) without needing any deeper models. It is also generally better to choose the most contributing features rather than more features (PCA and t-SNE are used for dimensionality reduction and for visualizing relationships), thereby avoiding unnecessary noise that can mislead the model.

2. Distribution -

The features are already normalized in any case. Linear models generally require normalization, while tree-based models are not sensitive to non-normal distributions. Several normalization techniques can be used to scale and normalize the data, such as scaling, clipping, and log scaling.

In [162]:
import matplotlib.gridspec as gridspec

# Class-conditional distributions for the selected feature columns.
# Fix: size the figure and grid by the number of features instead of the
# hard-coded 28 rows — the original allocated a 12x112-inch figure to draw
# a single panel.
feature_cols = ['U']
plt.figure(figsize=(12, 4 * len(feature_cols)))
gs = gridspec.GridSpec(len(feature_cols), 1)
for i, cn in enumerate(feature_cols):
    ax = plt.subplot(gs[i])
    sns.distplot(df[cn][df.y == 1], bins=50)  # positive class
    sns.distplot(df[cn][df.y == 0], bins=50)  # negative class
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
plt.show()
In [163]:
def plot_feature_distribution(df1, features):
    """Plot the value distribution of each requested column of ``df1``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Data to plot from. (Fix: the original body read the global ``df``
        instead of this parameter.)
    features : str or iterable of str
        Column name(s); a single name may be passed as a plain string.
    """
    # A bare string would otherwise be iterated character by character —
    # the original call with 'U' only worked because 'U' has one character.
    if isinstance(features, str):
        features = [features]
    n = len(features)
    sns.set_style('whitegrid')
    # Fix: the original also called plt.figure() before plt.subplots(),
    # leaving a stray empty figure in the output.
    fig, ax = plt.subplots(n, 1, figsize=(12, 10))
    for i, feature in enumerate(features, start=1):
        # Fix: grid sized to the feature count instead of a hard-coded 1x1.
        plt.subplot(n, 1, i)
        sns.distplot(df1[feature], color="orange", kde=True, bins=60, label='data')
        plt.xlabel(feature, fontsize=9)
        plt.legend()
    plt.show()
plot_feature_distribution(df, 'U')
<Figure size 432x288 with 0 Axes>

2.1 Statistical Properties

In [184]:
# Report the mean and standard deviation of column U. The f-string reproduces
# the original multi-argument print() output byte-for-byte (default sep=' ').
col_u = df['U']
print(f"Mean value of Column U:  {col_u.mean()} \n STD value of Column U:  {col_u.std()}")
Mean value of Column U:  0.08431981305291293 
 STD value of Column U:  0.7627640930884917
In [185]:
# Summary statistics for column U: count, mean, std, min/max and quartiles.
df['U'].describe()
Out[185]:
count    9985.000000
mean        0.084320
std         0.762764
min        -1.266667
25%        -0.466667
50%         0.000000
75%         0.533333
max         4.400000
Name: U, dtype: float64

2.2 Normalization Techniques

Techniques such as MinMaxScaler, StandardScaler, Box-Cox, and log scaling can be used.

3. Interdependence

3.1 Correlation

In [164]:
# Pearson correlation between columns D and H.
df['D'].corr(df['H'])
Out[164]:
0.06875412417068767

3.2 Correlation coefficient - P value

In [168]:
# 2x2 correlation matrix; the off-diagonal entries are corr(D, H).
np.corrcoef(df['D'], df['H'] )
Out[168]:
array([[1.        , 0.06875412],
       [0.06875412, 1.        ]])

0.06 indicates a weak positive linear relationship between the two variables. We can also confirm the relationship with a linear regression correlation test.

In [178]:
# from scipy.stats import linregress
# linregress(df['H'],df['D'])
In [175]:
# Contingency table of D (continuous, so each distinct value becomes a row)
# against the category codes of H.
# Lower values of D appear concentrated in particular H categories —
# presumably the "Mortgage" code; confirm against the label encoding.
pd.crosstab(df['D'], df['H'])
Out[175]:
H 0 1 2 3 4
D
-1.192593 84 0 0 8 52
-1.124074 57 0 0 8 43
-1.087037 57 0 0 5 36
-1.085185 2 0 0 0 4
-1.079630 70 0 0 12 31
... ... ... ... ... ...
2.103704 0 0 0 0 1
2.159259 1 0 0 0 0
2.231481 0 0 0 0 1
2.268519 1 0 0 0 0
2.357407 1 0 0 0 0

330 rows × 5 columns

3.3 Chi-squared test of independence

In [177]:
from scipy.stats import chi2_contingency
# Chi-squared test of independence between D and H.
# Returns (chi2 statistic, p-value, degrees of freedom, expected frequencies).
chi2_contingency(pd.crosstab(df['D'], df['H']))
Out[177]:
(3724.3166557430845,
 1.768878637916804e-228,
 1316,
 array([[6.43204807e+01, 1.44216324e-02, 4.03805709e-01, 1.05277917e+01,
         6.87335003e+01],
        [4.82403605e+01, 1.08162243e-02, 3.02854281e-01, 7.89584377e+00,
         5.15501252e+01],
        [4.37736605e+01, 9.81472208e-03, 2.74812218e-01, 7.16474712e+00,
         4.67769654e+01],
        ...,
        [4.46670005e-01, 1.00150225e-04, 2.80420631e-03, 7.31096645e-02,
         4.77315974e-01],
        [4.46670005e-01, 1.00150225e-04, 2.80420631e-03, 7.31096645e-02,
         4.77315974e-01],
        [4.46670005e-01, 1.00150225e-04, 2.80420631e-03, 7.31096645e-02,
         4.77315974e-01]]))

As seen, the p-value of the Chi-squared test is less than 0.05 (1.768878637916804e-228), so we reject the null hypothesis that column D is not associated with column H. It seems that for lower values of column D, more observations fall into the Mortgage category.

Feature Selection

4.1 Techniques

For feature selection, several families of methods exist: filter, wrapper, and embedded methods. Simply using the correlation of variables or defining a variance threshold can also help find the features that contribute most to the model.

I'll show one example from each type:

In [192]:
# Filter method example: VarianceThreshold drops constant (zero-variance)
# features.
from sklearn.feature_selection import VarianceThreshold
thres = VarianceThreshold(threshold=0)
thres.fit(X)
thres.get_support()  # boolean mask — True means the feature is retained
Out[192]:
array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True])
In [196]:
# Filter method example: mutual information between each feature and the target.
from sklearn.feature_selection import mutual_info_classif

plt.figure(figsize=(12, 8))
mi_scores = mutual_info_classif(X, y)
# All columns except the last (the target) serve as the index labels.
feature_importance = pd.Series(mi_scores, df.columns[:-1])
feature_importance.plot(kind='barh')
Out[196]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd33c433490>

Exhaustive Feature Selection, and Lasso can also be used

5.1 Feature Importance

In [186]:
# NOTE(review): this cell repeats the earlier feature-importance bar chart
# verbatim; consider wrapping the plot in a helper function instead of
# duplicating the cell.
tmp = pd.DataFrame({'Feature': features, 'Feature importance': imp['Importance']})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()   

Note:

If the prediction set will also contain feature X, then keeping X in the training set wouldn't be a problem in terms of data leakage. Otherwise, it would clearly be problematic to rely on this model's performance. In the first case, I would probably go for stratified k-fold cross-validation built into a pipeline.